package org.bao.netcrawler.utils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 
 * Strips script elements, style elements and all remaining HTML tags from a string.
 * @author Bao<bb@feijiao.info>
 * @since 2010-12-24
 * @version 0.9.0
 * @license GPLv3
 */
public class HTMLFilter {
	/**
	 * A complete script element, body included. The lookahead after the tag
	 * name keeps longer names (e.g. {@code <scripted>}) from matching, and the
	 * end tag may carry whitespace before {@code >}.
	 */
	private static final Pattern SCRIPT_ELEMENT = Pattern.compile(
			"<script(?=[\\s/>])[^>]*>[\\s\\S]*?</script\\s*>", Pattern.CASE_INSENSITIVE);

	/** A complete style element, body included; same rules as {@link #SCRIPT_ELEMENT}. */
	private static final Pattern STYLE_ELEMENT = Pattern.compile(
			"<style(?=[\\s/>])[^>]*>[\\s\\S]*?</style\\s*>", Pattern.CASE_INSENSITIVE);

	/** An HTML comment, which may itself contain {@code >} characters or tags. */
	private static final Pattern COMMENT = Pattern.compile("<!--[\\s\\S]*?-->");

	/** Any remaining tag: everything from {@code <} up to the next {@code >}. */
	private static final Pattern TAG = Pattern.compile("<[^>]+>");

	/** One or more whitespace characters. */
	private static final Pattern WHITESPACE = Pattern.compile("\\s+");

	/**
	 * Removes markup from {@code content} and returns the remaining text.
	 *
	 * <p>Script and style elements are removed together with their bodies,
	 * then HTML comments, then every remaining tag. Finally <em>all</em>
	 * whitespace is removed, including the whitespace between words, so
	 * {@code "<p>Hello World</p>"} becomes {@code "HelloWorld"}.
	 *
	 * @param content the HTML text to filter; may be {@code null}
	 * @return the text with markup and whitespace removed, or an empty string
	 *         when {@code content} is {@code null}
	 */
	public static String filter(String content){
		if (content == null) {
			return "";
		}

		// Script/style go first so that their bodies are dropped as a unit;
		// once the generic tag pass has run only the bare text would be left.
		String text = SCRIPT_ELEMENT.matcher(content).replaceAll("");
		text = STYLE_ELEMENT.matcher(text).replaceAll("");

		// Comments must go before the generic tag pass: "<[^>]+>" would stop at
		// the first '>' inside a comment and leave the rest of it in the text.
		text = COMMENT.matcher(text).replaceAll("");
		text = TAG.matcher(text).replaceAll("");

		return WHITESPACE.matcher(text).replaceAll("");
	}
}
